library(pbapply)
library(EnsDb.Hsapiens.v79)
## Loading required package: ensembldb
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which, which.max, which.min
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: GenomicFeatures
## Loading required package: AnnotationDbi
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: AnnotationFilter
## 
## Attaching package: 'ensembldb'
## The following object is masked from 'package:stats':
## 
##     filter
dropout_percent <- c(0,5,10,20,30,40,50,75,80,85,90,95)
data_in <- sapply(dropout_percent, function(y) paste0("/Users/elise/Desktop/GitHub/Hubness_sc/bulkRNAseq/simul",y,".csv"))
data_out <- sapply(dropout_percent, function(y) paste0("/Users/elise/Desktop/GitHub/Hubness_sc/bulkRNAseq/simul_pca_readyforhubness",y,".csv"))
library(circlize)
## ========================================
## circlize version 0.4.9
## CRAN page: https://cran.r-project.org/package=circlize
## Github page: https://github.com/jokergoo/circlize
## Documentation: https://jokergoo.github.io/circlize_book/book/
## 
## If you use it in published research, please cite:
## Gu, Z. circlize implements and enhances circular visualization
##   in R. Bioinformatics 2014.
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(circlize))
## ========================================
library(ComplexHeatmap)
## Loading required package: grid
## ========================================
## ComplexHeatmap version 2.4.2
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
## 
## If you use it in published research, please cite:
## Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
##   genomic data. Bioinformatics 2016.
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
library(ggplot2)
zero_percent <- function(data) { # gene x cell matrix
  return(sum(data==0)/ncol(data)/nrow(data)*100)
}
data_prep <- function(data) {
  data <- log10(data+1)
  return(data)
}
dist_heatmap <- function(dist) {
  col_fun = colorRamp2(c(0,0.5,1),heat.colors(n=3))
  Heatmap(as.matrix(dist), cluster_rows = T, cluster_columns = T, col = col_fun,
        border = T, show_column_names = F, show_row_names = F, name = "Euclidean distance")
}
max_min_ratio <- function(dist) {
  tmp <- as.matrix(dist)
  diag(tmp) <- NA
  ratios <- apply(tmp, 1, function(x) max(x, na.rm = T)/min(x, na.rm = T))
  return(ratios)
}
relative_contrast <- function(dist) {
  tmp <- as.matrix(dist)
  diag(tmp) <- NA
  rc <- apply(tmp, 1, function(x) abs(max(x, na.rm = T)-min(x, na.rm = T))/min(x, na.rm = T))
  return(rc)
}
data <- pblapply(X=data_in, FUN=function(x) read.table( file=x)) 
data <- pblapply(data, function(x) return(x[grep("^ENS",rownames(x)),])) # 57760 x 1213
data <- pblapply(data, function(x) {rownames(x) <- gsub("\\..*","",rownames(x)); return(x)})
# Sparsity
dropout <- sapply(data,function(x) zero_percent(x)) # 46%, 49%, 55%, 60%, 66%, 72%, 86%, 88%, 92%, 94%, 97%
# Replace with genes names, remove duplicates
genes <- ensembldb::select(EnsDb.Hsapiens.v79, keys=rownames(data[[1]]), keytype = "GENEID", columns = c("SYMBOL"))$SYMBOL
duplicates <- duplicated(genes)
data <- pblapply(data, function(x) {x <- x[!duplicates,];
                                    rownames(x) <- unique(genes);
                                    return(x)}) # 56029 x 1213
# Keep 10k HVG
data <- pblapply(data, function(x) {hvg <- names(sort(apply(x,1,var), decreasing = T)[1:1e4]); return(x[hvg,])})

# make the distance heatmap
pairwise_dist <- pblapply(data, function(x) dist(t(x)))
pairwise_dist_norm <- pblapply(pairwise_dist, function(x) x/max(x))
pblapply(pairwise_dist_norm, dist_heatmap)
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

## 
## [[12]]

# look at relative contrast
ratio <- pbsapply(pairwise_dist_norm, max_min_ratio)
rc <- pbsapply(pairwise_dist_norm, relative_contrast)
n_cell <- nrow(rc)
rc <- data.frame("Relative_contrast"=as.vector(rc),
                 "Dropout_rate"=rep(dropout_percent,each=n_cell))
ratio <- data.frame("Ratio"=as.vector(ratio),
                 "Dropout_rate"=rep(dropout_percent,each=n_cell))
ggplot(rc, aes(x=Relative_contrast, fill=factor(Dropout_rate))) +
  geom_density(aes(group=Dropout_rate), alpha=0.5)

ggplot(ratio, aes(x=Ratio, fill=factor(Dropout_rate))) +
  geom_density(aes(group=Dropout_rate), alpha=0.5)

nPC = 10
data_pca <- pblapply(X=data_out, FUN=function(x) read.table(file=x))
pca <- pblapply(data_pca, function(x) return(x[1:nPC,]))
# make the distance heatmap
pairwise_dist_pca <- pblapply(pca, function(x) dist(t(x)))
pairwise_dist_norm_pca <- pblapply(pairwise_dist_pca, function(x) x/max(x))
pblapply(pairwise_dist_norm_pca, dist_heatmap)
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

## 
## [[12]]

# look at relative contrast
ratio_pca <- pbsapply(pairwise_dist_norm_pca, max_min_ratio)
rc_pca <- pbsapply(pairwise_dist_norm_pca, relative_contrast)
n_cell <- nrow(rc_pca)
rc_pca <- data.frame("Relative_contrast"=as.vector(rc_pca),
                 "Dropout_rate"=rep(dropout_percent,each=n_cell))
ratio_pca <- data.frame("Ratio"=as.vector(ratio_pca),
                 "Dropout_rate"=rep(dropout_percent,each=n_cell))
ggplot(rc_pca, aes(x=Relative_contrast, fill=factor(Dropout_rate))) +
  geom_density(aes(group=Dropout_rate), alpha=0.5)

ggplot(ratio_pca, aes(x=Ratio, fill=factor(Dropout_rate))) +
  geom_density(aes(group=Dropout_rate), alpha=0.5)

total_rc <- rbind(rc_pca,rc)
total_rc$Dimension <- rep(c(nPC,1e4), each=nrow(rc))
total_ratio <- rbind(ratio_pca, ratio)
total_ratio$Dimension <- rep(c(nPC,1e4), each=nrow(ratio))
ggplot(total_rc, aes(x=factor(Dimension), y=Relative_contrast)) +
  geom_violin(aes(group=Dimension, fill=factor(Dimension))) +
  xlab("Dimension 10 vs 10k") +
  geom_line(aes(group=Dropout_rate), alpha=0.5)

ggplot(total_ratio, aes(x=factor(Dimension), y=Ratio)) +
  geom_violin(aes(group=Dimension, fill=factor(Dimension))) +
  xlab("Dimension 10 vs 10k") +
  ylab("Ratio min/max") +
  geom_line(aes(group=Dropout_rate), alpha=0.5)